import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import cufflinks as cf
from tqdm import tqdm
import chart_studio
import re
import urllib.request
import os
chart_studio.tools.set_credentials_file(username='zhyiyang', api_key='bwSqRbNvLD1oZ8xYIiQF')
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()
Index = pd.read_excel("data/index.xlsx")
Index = Index.set_index("Time", drop=True)
Index.columns = ["prior_close", "open", "high", "low", "close", "volume", "amount", "change", "change_percent"]
Index
| prior_close | open | high | low | close | volume | amount | change | change_percent | |
|---|---|---|---|---|---|---|---|---|---|
| Time | |||||||||
| 2015-01-05 | 2581.57 | 2612.85 | 2678.44 | 2583.55 | 2649.64 | 18242556600 | 229763603407 | 68.07 | 2.6368 |
| 2015-01-06 | 2649.64 | 2616.29 | 2691.49 | 2591.83 | 2629.40 | 17343999300 | 220601814712 | -20.24 | -0.7638 |
| 2015-01-07 | 2629.40 | 2608.88 | 2668.84 | 2598.89 | 2635.23 | 12831772000 | 178386105612 | 5.83 | 0.2217 |
| 2015-01-08 | 2635.23 | 2647.82 | 2656.75 | 2551.82 | 2558.11 | 11836791500 | 157065780834 | -77.12 | -2.9267 |
| 2015-01-09 | 2558.11 | 2554.29 | 2692.61 | 2540.34 | 2569.02 | 15620951000 | 211005825866 | 10.91 | 0.4266 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2022-03-07 | 3062.38 | 3042.02 | 3042.02 | 2965.22 | 2974.68 | 4265843700 | 88830290514 | -87.71 | -2.8640 |
| 2022-03-08 | 2974.68 | 2983.05 | 3005.56 | 2920.97 | 2933.16 | 4105561100 | 88323419669 | -41.51 | -1.3955 |
| 2022-03-09 | 2933.16 | 2949.33 | 2965.57 | 2809.99 | 2907.18 | 4727191800 | 97616288978 | -25.99 | -0.8859 |
| 2022-03-10 | 2907.18 | 2969.49 | 2973.61 | 2931.09 | 2933.46 | 3422380000 | 83598077050 | 26.28 | 0.9041 |
| 2022-03-11 | 2933.46 | 2897.34 | 2944.25 | 2852.79 | 2935.68 | 3645774600 | 81836077738 | 2.22 | 0.0758 |
1749 rows × 9 columns
qf = cf.QuantFig(Index,title='SSE 50 Index',legend='top',name='GS')
qf.add_volume(name='Volume',up_color='green', down_color='red')
qf.add_macd(name="MACD")
qf.iplot()
First, we import all the data for individual stocks, these stocks contains all the stocks that appeared in the SSE 50 Index since year 2015. We can do this by getting all the stock in the SSE 50 Indexat the start of year 2015 and add the changed stocks into the list.
index_component_start = pd.read_excel("data/20150101-20150520.xlsx", sheet_name="行业")
stocks = index_component_start["代码"].to_list()
index_component_change = pd.read_excel("data/index_adjustment.xlsx")
stocks_ch = index_component_change["代码"].to_list()
index_component = list(set(stocks + stocks_ch))
len(index_component)
111
Now, we can get the data of each stock and do the data cleaning
def data_cleaning(data, filename):
# drop useless columns
data.drop(["名称"], axis=1, inplace=True)
data.drop("Unnamed: 0",axis=1, inplace=True)
# change column names
data.columns = ["time", "code", "close", "high", "low", "open", "pre-close",
"change_amount", "change_rate", "turnover_rate", "volume",
"amount", "market_value", "l_market_value"]
# deal with the missing value because of the stop trading of individual stocks
# the close, open, high, low during the stop trading is same as the previous
for i in ["close", "open", "high", "low"]:
data[i].replace(0, np.nan, inplace=True)
data[i] = data[i].fillna(method='ffill')
# change amount and change rate during the stop trading time is 0
data["turnover_rate"].replace(np.nan, 0, inplace=True)
data['change_amount'].replace(np.nan, 0, inplace=True)
data['change_rate'].replace(np.nan, 0, inplace=True)
# other variables remains 0
data.to_csv(f"{filename}.csv", index=False)
for i in tqdm(index_component):
data = pd.read_csv(f"stocks/sh{i[:6]}.csv")
data_cleaning(data, i[:6])
100%|██████████| 111/111 [00:02<00:00, 40.83it/s]
Now we took 3 of them and check.
for i in tqdm(index_component[:1]):
data = pd.read_csv(f"{i[:6]}.csv")
data = data.set_index("time", drop=True)
qf = cf.QuantFig(data,title=f'{i}',legend='top',name='GS')
qf.add_volume(name='Volume',up_color='green', down_color='red')
qf.add_macd(name="MACD")
qf.iplot()
100%|██████████| 1/1 [00:00<00:00, 21.81it/s]
for i in tqdm(index_component[1:2]):
data = pd.read_csv(f"{i[:6]}.csv")
data = data.set_index("time", drop=True)
qf = cf.QuantFig(data,title=f'{i}',legend='top',name='GS')
qf.add_volume(name='Volume',up_color='green', down_color='red')
qf.add_macd(name="MACD")
qf.iplot()
100%|██████████| 1/1 [00:00<00:00, 111.47it/s]
for i in tqdm(index_component[2:3]):
data = pd.read_csv(f"{i[:6]}.csv")
data = data.set_index("time", drop=True)
qf = cf.QuantFig(data,title=f'{i}',legend='top',name='GS')
qf.add_volume(name='Volume',up_color='green', down_color='red')
qf.add_macd(name="MACD")
qf.iplot()
100%|██████████| 1/1 [00:00<00:00, 100.28it/s]
Now, let's double check there is missing values in the whole 111 stock datasets.
def not_null_count(column):
"""
function to check if there is missing values
"""
column_null = pd.isnull(column)
null = column[column_null]
return len(null)
missing_number = 0
for i in tqdm(index_component):
data = pd.read_csv(f"{i[:6]}.csv")
data = data.set_index("time", drop=True)
column_null_count = data.apply(not_null_count)
missing_number += column_null_count.sum()
if column_null_count.sum() > 0:
print(i)
print(missing_number)
100%|██████████| 111/111 [00:00<00:00, 133.13it/s]
0